
************************************
*********** PROJECT INFO ***********
************************************

// Record Linkage for Character-Based Names
// Core ABE matches - Chinese
// Author: Hannah Postel
// Date: 10/26/2022

// Preflight: every input path in this file is built from the global "data",
// which is not defined in this file. Stop here, before any frame is reset,
// if it is empty; otherwise the paths below would collapse to
// "/1880_chn_clean.dta"-style paths and fail later with a less obvious error.
	if `"$data"' == "" {
		display as error "global data is not set; define it before running this do-file"
		exit 198
	}

frames reset

**************************************
************** STEP ZERO *************
*********** BASELINE MATCH ***********
**************************************

// Set up ABE matching algorithm
// A is the 1880 file and B is the 1900 file; both are handed to abematch by path.
// timediff is the number of years between the two censuses.
	global A "$data/1880_chn_clean.dta"
	global B "$data/1900_chn_clean.dta"
	global match_vars namefrst namelast // census names as written
	global timediff = (1900 - 1880)

// Standard ABE match. N=2,418.
// The default frame is emptied first; the match result is left in it.
	frame change default
	clear
	cd "$data"
	abematch $match_vars, file_A($A) file_B($B) timevar(age) timediff($timediff) unique_m(2) unique_f(2) keep_A(histid_1880) keep_B(histid_1900)

// Tidy the result: drop the time-difference and age columns, strip the
// _A/_B suffixes, tag the step, then prefix EVERY variable with "_".
	drop timediff* age
	ren *_A *
	ren *_B *
	gen step = "0" // identify these as base matches
	ren * _*

// Start the running 'matches' frame from these baseline matches.
// The segmented name parts are not used in this step, so they are filled
// with the "." placeholder in the matches frame.
	frame copy default matches, replace // creating the 'match' frame
	frame matches {
		forvalues part = 1/4 {
			gen _name`part' = "."
		}
	}
	
	
***************************************
*************** STEP ONE **************
********** POST-SEGMENTATION **********
***************************************

// Load segmented names (see name-clean dofiles)
// Re-point the A/B globals at the segmented-name files and match on all
// four name parts instead of the raw first/last names.
	frame change default
	global A "$data/1880_chn_finalnames.dta"
	global B "$data/1900_chn_finalnames.dta"
	global match_vars name1 name2 name3 name4

// Standard ABE match with segmented names.
	clear
	abematch $match_vars, ///
		file_A($A) file_B($B) ///
		timevar(age) timediff($timediff) ///
		unique_m(2) unique_f(2) ///
		keep_A(histid_1880) keep_B(histid_1900)

// Same tidy-up as the baseline step. The raw census name fields are not
// part of this match, so they are created as "." placeholders.
	drop timediff* age
	rename *_A *
	rename *_B *
	foreach placeholder in namefrst namelast {
		generate `placeholder' = "."
	}
	generate step = "1" // identify this came after segmentation
	rename * _*

// Merge and identify duplicates between steps
// Step zero matches take priority (they matched before any cleaning).
// A histid that appears twice after the append is dropped from step 1.
// This is done once per census year:
//   pass on _histid_1880 drops n=1604
//   pass on _histid_1900 drops n=124
// Final result: N=4,643 (n=2,225 from segmentation step)

	frameappend matches
	foreach id in _histid_1880 _histid_1900 {
		bysort `id': generate n_rows = _N
		drop if n_rows==2 & _step=="1"
		drop n_rows
	}
	frame copy default matches, replace

// Create working dataset for continued use
// For each census year: load the segmented-name file into the default frame,
// prefix every variable with "_", link to the matches frame on that year's
// histid, and keep only the rows with no link (not yet matched).
// The result is copied to frame f<year> and trimmed to the name parts,
// the histid and age.
// Rows remaining: 1880 N=92,786; 1900 N=71,806

	foreach yr in 1880 1900 {
		frame change default
		use "$data/`yr'_chn_finalnames.dta", clear
		rename * _*
		frlink 1:1 _histid_`yr', frame(matches)
		keep if matches==.
		frame copy default f`yr', replace
		frame f`yr': keep _name1 _name2 _name3 _name4 _histid_`yr' _age
	}

	
************************************* 
************* STEP TWO **************
********** SWAP NAME ORDER **********
*************************************

// These people match after flipping the name order.
// Here I match first-to-last and last-to-first.
// Each length name done separately

// Two-part names

 * Subset each working frame to rows whose third name part is the "." placeholder
	frame copy f1880 f1880_2, replace
	frame copy f1900 f1900_2, replace
	frame f1880_2: keep if _name3=="." // 90,038
	frame f1900_2: keep if _name3=="." // 62,588

// Flip the name order in the 1900 subset only:
// the old name1 becomes name2 and the old name2 becomes name1.
	frame f1900_2 {
		gen _name1_swap = _name2
		replace _name2 = _name1
		drop _name1
		ren _name1_swap _name1
	}

// Write both subsets to disk
// abematch command requires saved files
	frame f1880_2: save "$working/f1880_2.dta", replace
	frame f1900_2: save "$working/f1900_2.dta", replace
	frame change default

// Standard ABE match with swapped two-part names. N=2,193.
	global A "$working/f1880_2.dta"
	global B "$working/f1900_2.dta"
	global match_vars _name1 _name2 // two-part names only
	clear
	abematch $match_vars, ///
		file_A($A) file_B($B) ///
		timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) ///
		keep_A(_histid_1880) keep_B(_histid_1900)

// Variables already carry the "_" prefix here, so only the _A/_B suffixes
// are stripped. The raw census name fields are "." placeholders.
	drop timediff* _age
	rename _*_A _*
	rename _*_B _*
	foreach placeholder in _namefrst _namelast {
		generate `placeholder' = "."
	}
	generate _step = "2.1" // identify this came after swapping 2-part name order.

// Add to match dataset. N=6,835
	frameappend matches
	frame copy default matches, replace

// Remove step 2.1 matches from OVERALL working data
// Rows remaining: 1880 90,593; 1900 69,613
	foreach yr in 1880 1900 {
		frame change f`yr'
		frlink 1:1 _histid_`yr', frame(matches)
		keep if matches==.
		drop matches
	}


// THREE-PART NAMES

 * Subset each working frame to rows with a third name part but no fourth
	frame change default
	frame copy f1880 f1880_3, replace
	frame copy f1900 f1900_3, replace
	frame f1880_3: keep if _name3!="." & _name4=="." // 2,730
	frame f1900_3: keep if _name3!="." & _name4=="." // 9,064

// Rotate the name order in the 1900 subset only:
// name1 -> name3; name2 -> name1; name3 -> name2
// E.g. Deng Xiao Ping -> Xiao Ping Deng
// (after the rotation the two-part "given" name sits in name1 and name2)
	frame f1900_3 {
		gen name1_swap = _name1
		replace _name1 = _name2
		replace _name2 = _name3
		replace _name3 = name1_swap
		drop name1_swap
	}

// Write both subsets to disk
// abematch command requires saved files
	frame f1880_3: save "$working/f1880_3.dta", replace
	frame f1900_3: save "$working/f1900_3.dta", replace

// Standard ABE match with swapped three-part names. N=1.
	global A "$working/f1880_3.dta"
	global B "$working/f1900_3.dta"
	global match_vars _name1 _name2 _name3 // three-part names only

	frame change default
	clear
	abematch $match_vars, ///
		file_A($A) file_B($B) ///
		timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) ///
		keep_A(_histid_1880) keep_B(_histid_1900)

// Strip the _A/_B suffixes and add the "." placeholders for the raw name fields.
	drop timediff* _age
	rename _*_A _*
	rename _*_B _*
	foreach placeholder in _namefrst _namelast {
		generate `placeholder' = "."
	}
	generate _step = "2.2" // identify this came after 3-part name swaps

// Add to match dataset. N=6,836
	frameappend matches
	frame copy default matches, replace

// Remove step 2.2 matches from OVERALL working data
// Rows remaining: 1880 90,592; 1900 69,612
	foreach yr in 1880 1900 {
		frame change f`yr'
		frlink 1:1 _histid_`yr', frame(matches)
		keep if matches==.
		drop matches
	}

// the other name order flip option doesn't yield any matches
// E.g. Deng Xiao Ping -> Ping Deng Xiao
// name1->name2; name2->name3; name3->name1
	

************************************* 
************ STEP THREE *************
********** STANDARDIZATION **********
*************************************

// These people match after "standardizing" different Romanizations
// Essentially dimension reduction across dialects and spellings
// Will do this both in original name order and swapped

// NYC Exclusion Index crosswalk
// For each census year: copy the working frame to f<year>_std, rename
// _name1 to name1 (the merge key used with the crosswalk file), attach the
// crosswalk, standardize name1, and save the result for abematch.
//
// Replace all name spellings with a frequently mapped character
// Can choose which threshold; replace with utf code
// Goal is to take multiple spellings and condense to a character equivalent
// Using the crosswalk as a 'ground truth' mapping with frequencies
//
// Threshold comparison: the literal .7 is a double. If char1_prop is stored
// as float, a value of exactly 0.7 is held as roughly 0.69999999 and would
// fail ">= .7". Comparing against float(.7) admits that boundary value
// whether char1_prop is float or double.
// NOTE(review): storage type of char1_prop is not visible here; confirm.
//
// Counts recorded by the author with the earlier ">= .7" comparison
// (replacement counts may rise if boundary values exist):
//   1880: 38,929 match of 90,592; 21,511 replaced
//   1900: 41,521 match of 69,612; 24,623 replaced

	foreach yr in 1880 1900 {
		frame copy f`yr' f`yr'_std, replace
		frame change f`yr'_std
		ren _name1 name1

		// keep(master match) discards crosswalk-only rows and nogenerate
		// suppresses _merge; same result as "keep if _merge!=2" + "drop _merge".
		merge m:1 name1 using "$data/crosswalk_final_wide.dta", keep(master match) nogenerate

		replace name1=char1_utf if char1_prop>=float(.7) & char1_prop!=.
		save "$working/f`yr'_std.dta", replace
	}

// Standard ABE match with standardized names in correct order. N=509.
// name1 has no "_" prefix in the standardized files, so it is matched under
// that name and renamed back to _name1 afterwards.
	global A "$working/f1880_std.dta"
	global B "$working/f1900_std.dta"
	global match_vars name1 _name2 _name3 _name4

	frame change default
	clear
	abematch $match_vars, ///
		file_A($A) file_B($B) ///
		timevar(_age) timediff($timediff) ///
		unique_m(2) unique_f(2) ///
		keep_A(_histid_1880) keep_B(_histid_1900)

	drop timediff* _age
	rename _*_A _*
	rename _*_B _*
	rename name1 _name1
	foreach placeholder in _namefrst _namelast {
		generate `placeholder' = "."
	}
	generate _step = "3.0" // identify this came after standardization without order change

// Add to match dataset and save final. N=7,368
	frameappend matches
	frame copy default matches, replace
	frame change matches

// Steps 0 and 1 prefixed every variable with "_", so their unique_* values
// sit in _unique_*; later steps left those variables unprefixed.
// Fill the unprefixed columns from the prefixed ones where they are missing.
	foreach stat in unique_file1 unique_file2 unique_match1 unique_match2 {
		replace `stat' = _`stat' if `stat'==.
	}

// Drop the now-redundant prefixed copies and the fourth name part, then
// strip the leading "_" from everything that still has one.
	drop _unique* _name4
	rename _* *

	save "$data/matches 26 oct.dta", replace
		
